Student Name: Kimon Iliopoulos
Project Title: Worldwide life expectancy and its factors from 2000 to 2015
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
import missingno as msno
import plotly
import plotly.graph_objs as go
from scipy.stats import norm
from sklearn.impute import SimpleImputer
from scipy.stats.mstats import winsorize
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from scipy.stats import probplot
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.regression.linear_model as sm
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm
import plotly.express as px
# Load the life-expectancy table and the sanitation-mortality table.
dfExp = pd.read_csv('Life Expectancy Data.csv')
dfSan = pd.read_csv('Mortality Sanitation Dataset.csv')
# Keep only the two columns that are renamed below to 'Country' and
# 'Sanitation Mortality/100000(2016)', and discard the first four rows.
# NOTE(review): presumably those rows are header/metadata lines of the raw
# export - confirm against the CSV.
dfSan = dfSan[['Data Source','Unnamed: 60']]
dfSan = dfSan.drop(dfSan.index[0:4])
dfSan = dfSan.rename(columns={'Data Source': 'Country', 'Unnamed: 60': 'Sanitation Mortality/100000(2016)'})
# Right join: every sanitation row is kept.
# NOTE(review): newdf is not read again in the visible part of this file.
newdf = pd.merge(dfExp, dfSan, on='Country', how='right')
# One-column helper frames (column label 0) with the distinct country names
# found in each dataset.
newSan = pd.DataFrame(zip(dfSan.Country))
newSan = newSan.drop_duplicates(keep='first')
newExp = pd.DataFrame(zip(dfExp.Country))
newExp = newExp.drop_duplicates(keep='first')
# First pass: flag which sanitation countries also occur in the
# life-expectancy data (True for 'both', False for 'left_only'). This result
# is overwritten by the second pass below.
dfExpSan = newSan.merge(newExp, how='left', indicator=True)
dfExpSan['_merge'] = dfExpSan['_merge'].map({'left_only': False, 'both': True})
dfExpSan = dfExpSan.rename(columns={"_merge": "Common Countries"})
# Re-spell country names in the two helper frames before matching again.
# NOTE(review): the replacements are applied to newSan/newExp only; dfSan and
# dfExp keep their original spellings, so the on='Country' merges further down
# cannot match the renamed countries and those rows are filtered out together
# with the other non-common countries. Confirm whether the renames were meant
# to be applied to dfSan/dfExp as well.
newSan = newSan.replace({'Congo, Dem. Rep.':'Democratic Republic of the Congo', 'Congo, Rep.':'Congo',
'Egypt, Arab Rep.':'Egypt', 'Micronesia, Fed. Sts.':'Micronesia (Federated States of)',
'Gambia, The':'Gambia', 'Kyrgyz Republic':'Kyrgyzstan', 'St. Kitts and Nevis':'Saint Kitts and Nevis',
'Korea, Rep.':'Republic of Korea', 'Korea, Dem. People’s Rep.':"Democratic People's Republic of Korea",
'Slovak Republic':'Slovakia', 'United States':'United States of America',
'St. Vincent and the Grenadines':'Saint Vincent and the Grenadines',
'Venezuela, RB':'Venezuela (Bolivarian Republic of)', 'Yemen, Rep.':'Yemen', 'Bahamas, The':'Bahamas'})
# NOTE(review): 'Samoa' is mapped to 'American Samoa' here - verify that this
# is intended.
newExp = newExp.replace({'Samoa':'American Samoa', 'Bolivia (Plurinational State of)':'Bolivia',
'Czechia':'Czech Republic', 'United Kingdom of Great Britain and Northern Ireland':'United Kingdom',
'Iran (Islamic Republic of)':'Iran, Islamic Rep.'})
# Second pass: repeat the membership check with the re-spelled names and keep
# only the countries present in both helper frames.
dfExpSan = newSan.merge(newExp, how='left', indicator=True)
dfExpSan['_merge'] = dfExpSan['_merge'].map({'left_only': False, 'both': True})
dfExpSan = dfExpSan.rename(columns={"_merge": "Common Countries"})
dfExpSan = dfExpSan[dfExpSan['Common Countries'] == True]
dfExpSan = dfExpSan.rename(columns={0: "Country"})
# Restrict the sanitation table to the common countries ...
dfSan = dfSan.merge(dfExpSan, how='left', on = 'Country')
dfSan = dfSan[dfSan['Common Countries'] == True]
# ... then attach the sanitation columns to the life-expectancy rows and drop
# every row whose country did not get a 'Common Countries' flag of True.
dfExp = dfExp.merge(dfSan, how='left', on = 'Country')
dfExp = dfExp[dfExp['Common Countries'] == True]
# Attach the ISO alpha-3 code, region and sub-region of every country, read
# from 'all.csv' and matched on the country name.
dfCountryCodes = pd.read_csv('all.csv')
dfCountryCodes = dfCountryCodes[['name', 'alpha-3', 'region', 'sub-region']]
dfCountryCodes = dfCountryCodes.rename(columns={'name': "Country"})
dfExp = dfExp.merge(dfCountryCodes, how='left', on = 'Country')
# Keep only countries that contribute more than one row.
dfExp = dfExp[dfExp.groupby('Country').Country.transform(len) > 1]
dfExp = dfExp.rename(columns={'region' : 'Region', 'sub-region' : 'Sub-region', 'alpha-3' : 'Country code'})
# The helper flag has served its purpose once the common countries are
# selected. (The original repeated the filter and the rename a second time;
# both are no-ops on the already processed frame, so they are done once.)
dfExp = dfExp.drop(columns='Common Countries')
dfExp.describe()
dfExp.head()
dfExp.tail()
# Split the merged table into text-like (object dtype) and numeric columns.
# 'object' is used instead of np.object, an alias that recent NumPy releases
# no longer provide.
categoricalFeats = dfExp.select_dtypes(include=['object'])
categoricalFeats.columns
numericalFeats = dfExp.select_dtypes(include=[np.number])
numericalFeats.columns
# Visual overview of where values are missing.
msno.matrix(dfExp)
msno.heatmap(dfExp)
msno.bar(dfExp)
# Missing values per numeric column: absolute count and share of all rows in
# percent, sorted from most to least affected.
numericalFeatsMissingValues = numericalFeats.isnull().sum().sort_values(ascending=False)
numericalFeatsMissingValuesPerc = (numericalFeats.isnull().mean() * 100).sort_values(ascending=False)
numericalFeatsMissingData = pd.concat([numericalFeatsMissingValues, numericalFeatsMissingValuesPerc], axis=1,join='outer', keys=['Total Missing Numerical Values', '% of Total Observations'])
numericalFeatsMissingData.index.name =' Numerical Feature'
numericalFeatsMissingData
# Same table for the categorical columns. The share is multiplied by 100 so
# that it really is a percentage, like the numeric table above (the original
# reported a 0-1 fraction under the '%' heading).
categoricalFeatsMissingValues = categoricalFeats.isnull().sum().sort_values(ascending=False)
categoricalFeatsMissingValuesPerc = (categoricalFeats.isnull().mean() * 100).sort_values(ascending=False)
categoricalFeatsMissingData = pd.concat([categoricalFeatsMissingValues, categoricalFeatsMissingValuesPerc], axis=1,join='outer', keys=['Total Missing Categorical Values', '% of Total Observations'])
categoricalFeatsMissingData.index.name =' Categorical Feature'
categoricalFeatsMissingData
def _summariseMissing(frame):
    """Return (counts, percentages, table) describing missing values per column.

    counts      -- number of missing entries per column, largest first
    percentages -- share of rows that are missing, in percent, largest first
    table       -- both series side by side
    """
    counts = frame.isnull().sum().sort_values(ascending=False)
    percentages = (frame.isnull().mean() * 100).sort_values(ascending=False)
    table = pd.concat([counts, percentages], axis=1, join='outer',
                      keys=['Total Missing Values', '% of Total Observations'])
    return counts, percentages, table


# Working copy: categorical columns first, numeric columns after them.
dfExpImputed = numericalFeats.copy()
dfExpImputed = pd.concat([categoricalFeats, dfExpImputed], axis=1)

# Step 1: fill gaps with the mean of the same country.
# Only the numeric columns are imputed. They are taken from numericalFeats
# instead of a hard-coded positional offset, and the group mean is computed
# for one column at a time: the original transformed the whole frame once per
# column, which is quadratic in the number of columns and fails on the text
# columns in current pandas. Using fillna with the broadcast group mean also
# leaves observed values untouched in rows whose grouping key is missing.
for column in numericalFeats.columns:
    countryMean = dfExpImputed.groupby('Country')[column].transform('mean')
    dfExpImputed[column] = dfExpImputed[column].fillna(countryMean)
print('Missing values after fillna based on country')
dfExpImputedMissingValues, dfExpImputedMissingValuesPerc, dfExpImputedMissingData = _summariseMissing(dfExpImputed)
dfExpImputedMissingData

# Step 2: whatever is still missing (no value for the country at all) gets
# the mean of the country's sub-region.
for column in numericalFeats.columns:
    subRegionMean = dfExpImputed.groupby('Sub-region')[column].transform('mean')
    dfExpImputed[column] = dfExpImputed[column].fillna(subRegionMean)
print('Missing values after fillna based on subregion')
dfExpImputedMissingValues, dfExpImputedMissingValuesPerc, dfExpImputedMissingData = _summariseMissing(dfExpImputed)
dfExpImputedMissingData

# Store the year and the count-like columns as integers.
# NOTE(review): astype('int64') raises if any of these columns still contains
# a missing value after both imputation steps.
dfExpImputed['Year'] = dfExpImputed['Year'].astype('int64')
dfExpImputed['Measles '] = dfExpImputed['Measles '].astype('int64')
dfExpImputed['under-five deaths '] = dfExpImputed['under-five deaths '].astype('int64')
dfExpImputed['infant deaths'] = dfExpImputed['infant deaths'].astype('int64')
# Split the imputed table by dtype again. 'object' replaces np.object, an
# alias that recent NumPy releases no longer provide.
categoricalImputedFeats = dfExpImputed.select_dtypes(include=['object'])
categoricalImputedFeats.columns
numericalImputedFeats = dfExpImputed.select_dtypes(include=[np.number])
numericalImputedFeats.columns

# One boxplot and one histogram per numeric feature (Year excluded), laid out
# in a 10 x 4 grid: odd slots hold the boxplot, even slots the histogram.
columns = numericalImputedFeats.drop('Year', axis = 1)
plt.figure(figsize=(20, 20))
for pairIndex, feature in enumerate(columns):
    plt.subplot(10, 4, 2 * pairIndex + 1)
    plt.boxplot(numericalImputedFeats[feature])
    plt.title('{} boxplot'.format(feature))
    plt.subplot(10, 4, 2 * pairIndex + 2)
    plt.hist(numericalImputedFeats[feature])
    plt.title('{} histogram'.format(feature))
# The layout only has to be tightened once, after every panel exists.
plt.tight_layout()
plt.show()
# Winsorizing limits per feature as (lower tail fraction, upper tail fraction).
# Features that are not listed (e.g. ' BMI ') are copied unchanged.
winsorLimits = {
    'Life expectancy ': (0.01, 0),
    'Adult Mortality': (0, 0.03),
    'infant deaths': (0, 0.12),
    'Alcohol': (0, 0.01),
    'percentage expenditure': (0, 0.14),
    'Hepatitis B': (0.11, 0),
    'Measles ': (0, 0.19),
    'under-five deaths ': (0, 0.145),
    'Polio': (0.1, 0),
    'Total expenditure': (0, 0.01),
    'Diphtheria ': (0.11, 0),
    ' HIV/AIDS': (0, 0.185),
    'GDP': (0, 0.155),
    'Population': (0, 0.14),
    ' thinness 1-19 years': (0, 0.04),
    ' thinness 5-9 years': (0, 0.04),
    'Income composition of resources': (0.05, 0),
    'Schooling': (0.02, 0.01),
    'Sanitation Mortality/100000(2016)': (0, 0.099),
}

# Start from the imputed values and overwrite each listed feature with its
# winsorized version.
winsorizedNumericalImputedFeats = numericalImputedFeats.copy()
for featureName, tailLimits in winsorLimits.items():
    winsorizedNumericalImputedFeats[featureName] = winsorize(numericalImputedFeats[featureName], tailLimits)

# Re-draw the boxplot / histogram pairs to inspect the effect on the outliers.
plt.figure(figsize=(20, 20))
i = 0
for feature in winsorizedNumericalImputedFeats.drop('Year', axis = 1):
    featureValues = winsorizedNumericalImputedFeats[feature]
    for drawPanel, panelKind in ((plt.boxplot, 'boxplot'), (plt.hist, 'histogram')):
        i += 1
        plt.subplot(10, 4, i)
        plt.tight_layout()
        drawPanel(featureValues)
        plt.title('{} {}'.format(feature, panelKind))
plt.show()
# Full working table: categorical columns plus the winsorized numeric ones.
winsorizedImputedFeats = pd.concat([categoricalImputedFeats, winsorizedNumericalImputedFeats], axis=1)

# Scatter every numeric feature against life expectancy (4 x 5 grid).
columns = winsorizedNumericalImputedFeats.columns.drop(['Life expectancy ', 'Year'])
plt.figure(figsize = (20, 20))
for position, columnName in enumerate(columns, start=1):
    plt.subplot(4, 5, position)
    plt.scatter(winsorizedNumericalImputedFeats['Life expectancy '], winsorizedNumericalImputedFeats[columnName])
    plt.title(columnName)

# Correlation matrix of all numeric features except Year.
Corr1 = winsorizedNumericalImputedFeats.drop('Year', axis = 1).corr()
plt.figure(figsize = (12, 10))
sns.heatmap(Corr1, annot = True)

# Features whose absolute correlation with life expectancy exceeds 0.5.
corrLifeExp = abs(Corr1["Life expectancy "])
bestFeatures = corrLifeExp[corrLifeExp>0.5]
bestFeatures = bestFeatures.drop('Life expectancy ')
bestFeatures.sort_values()

# Correlations among a hand-picked subset of predictors.
Corr2 = winsorizedNumericalImputedFeats[['Adult Mortality', ' HIV/AIDS', 'Income composition of resources', 'Schooling',
                                         'Sanitation Mortality/100000(2016)']].corr()
plt.figure(figsize = (12, 10))
sns.heatmap(Corr2, annot = True)

# Drop the features that are not used in the rest of the analysis.
winsorizedImputedSelectedFeats = winsorizedImputedFeats.drop(['infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
                                                              'Measles ', ' BMI ', 'GDP', 'Population', ' thinness 1-19 years',
                                                              ' thinness 5-9 years', 'under-five deaths ', 'Polio',
                                                              'Diphtheria ', 'Total expenditure'], axis = 1)
# 'object' replaces np.object, an alias that recent NumPy releases no longer
# provide.
winsorizedImputedCatSelectedFeats = winsorizedImputedSelectedFeats.select_dtypes(include=['object'])
winsorizedImputedCNumSelectedFeats = winsorizedImputedSelectedFeats.select_dtypes(include=[np.number])
# Distribution of per-feature skewness and kurtosis, before (left panel) and
# after (right panel) winsorizing. One figure per statistic.
winsorizingStages = (('before', numericalImputedFeats), ('after', winsorizedNumericalImputedFeats))
for statLabel, statColour, statMethod in (('Skewness', 'green', 'skew'), ('Kurtosis', 'red', 'kurt')):
    plt.figure(figsize = (12, 5))
    for panelSlot, (stageName, stageFrame) in enumerate(winsorizingStages, start=1):
        plt.subplot(1, 2, panelSlot)
        sns.distplot(getattr(stageFrame, statMethod)(), color=statColour, axlabel=statLabel)
        plt.title('{} {} winsorizing the outliers'.format(statLabel, stageName))

# Per-feature distributions with a fitted normal curve, before and after
# winsorizing, side by side in a 10 x 4 grid.
columns = ['Life expectancy ', 'Adult Mortality', 'infant deaths',
           'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ',
           'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ',
           ' HIV/AIDS', 'GDP', 'Population', ' thinness 1-19 years',
           ' thinness 5-9 years', 'Income composition of resources', 'Schooling',
           'Sanitation Mortality/100000(2016)']
plt.figure(figsize = (14, 30))
counter = 0
for featureName in columns:
    for stageFrame, stageTitle in ((numericalImputedFeats, 'Before winsorizing'),
                                   (winsorizedNumericalImputedFeats, 'After winsorizing')):
        counter += 1
        plt.subplot(10, 4, counter)
        sns.distplot(stageFrame[featureName], fit=norm)
        plt.subplots_adjust(hspace = 0.8)
        plt.title(stageTitle)
# Mean life expectancy per country over all years, shown as a sorted bar chart.
colors = list('bgrcmyk')
LifeExpCountry = winsorizedImputedFeats.groupby('Country')['Life expectancy '].mean().sort_values()
LifeExpCountry.plot(kind='bar', figsize=(60,20), fontsize=25, color = colors )
plt.title("Life Expectancy for every Country",fontsize=40)
plt.xlabel("Country",fontsize=35)
plt.ylabel("Average Life Expectancy",fontsize=35)
plt.show()

# Mean life expectancy per region and year, computed from dfExp (the table
# before imputation and winsorizing).
lifeExpRegionYear = dfExp.groupby(['Region','Year'])['Life expectancy '].mean().reset_index()
axAfrica = lifeExpRegionYear[lifeExpRegionYear['Region'] == 'Africa']
axAmericas = lifeExpRegionYear[lifeExpRegionYear['Region'] == 'Americas']
axAsia = lifeExpRegionYear[lifeExpRegionYear['Region'] == 'Asia']
axEurope = lifeExpRegionYear[lifeExpRegionYear['Region'] == 'Europe']
axOceania = lifeExpRegionYear[lifeExpRegionYear['Region'] == 'Oceania']

# One line per region.
plt.figure(figsize=(12, 6), dpi=80)
for regionName, regionRows in (('Africa', axAfrica), ('Americas', axAmericas), ('Asia', axAsia),
                               ('Europe', axEurope), ('Oceania', axOceania)):
    plt.plot(regionRows['Year'], regionRows['Life expectancy '], label=regionName, linewidth=3)
plt.xlabel('Year')
plt.ylabel('Life Expectancy')
plt.suptitle('Life Expectancy per region for 2000-2015')
plt.legend(loc='center left')
plt.xticks(rotation=75)
# Relative difference between the first and the last life-expectancy row of
# every country: (first - last) / first.
# NOTE(review): which row is 2000 and which is 2015 depends on the row order
# of the input data - confirm before interpreting the sign.
LifeExp2000LifeExp2015 = pd.DataFrame(
    winsorizedImputedFeats.groupby('Country', sort=False).apply(
        lambda rows: (rows['Life expectancy '].values[0] - rows['Life expectancy '].values[-1])
        / rows['Life expectancy '].values[0]))
LifeExp2000LifeExp2015.reset_index(level=0, inplace=True)
LifeExp2000LifeExp2015.rename(columns={ LifeExp2000LifeExp2015.columns[1]: 'LifeExpChange'}, inplace = True)

# Order the countries from the largest to the smallest value.
x, y = (list(values) for values in zip(*sorted(zip(LifeExp2000LifeExp2015['LifeExpChange'],
                                                   LifeExp2000LifeExp2015['Country']),
                                               reverse = True)))
sns.set(font_scale=1)
f, ax = plt.subplots(figsize=(20, 60))
# x and y are passed by keyword: current seaborn releases no longer accept
# them as positional arguments.
sns.barplot(x=x, y=y, palette="vlag")
# NOTE(review): the axis label says 'Decrease' while the title says
# 'Increase' - one of the two is presumably wrong; left unchanged.
Text = ax.set(xlabel='Decrease in Life Expectancy from 2000-2015',
              title='Increase in Life Expectancy from 2000-2015')

# Life expectancy per year for the Syrian Arab Republic.
lifeExpSyria = winsorizedImputedFeats[winsorizedImputedFeats['Country'] == 'Syrian Arab Republic']
plt.figure(figsize=(10, 7))
ax = sns.barplot(x = lifeExpSyria['Year'], y = lifeExpSyria['Life expectancy '], palette="rocket")
plt.xticks(rotation = 45)
ax.set(xlabel='Years', ylabel = 'Life Expectancy', title='Syrian Arab Republic')
# Rows of developed and of developing countries.
developedCountries = winsorizedImputedFeats[winsorizedImputedFeats['Status'] == 'Developed']
developingCountries = winsorizedImputedFeats[winsorizedImputedFeats['Status'] == 'Developing']

# Overlaid life-expectancy distributions of the two groups.
plt.figure(figsize=(12, 6), dpi=80)
sns.distplot(developedCountries['Life expectancy '], color = 'k')
sns.distplot(developingCountries['Life expectancy '], color = 'y')
labels = ['Developed Countries', 'Developing Countries']
plt.legend(labels=labels, bbox_to_anchor=(1, 1))
plt.show()

# Interactive scatter: one marker per row, the country name as hover text.
figure = {
    'data': [
        {'x': statusRows['Status'],
         'y': statusRows['Life expectancy '],
         'text': statusRows['Country'],
         'mode': 'markers',
         'name': traceName}
        for statusRows, traceName in ((developedCountries, 'Developed Countries'),
                                      (developingCountries, 'Developing Countries'))
    ],
    'layout': {'xaxis': {'title': 'Status'}, 'yaxis': {'title': "Life Expectancy"}},
}
plotly.offline.iplot(figure)
def _meanLifeExpChangePerYear(statusRows):
    """Return the mean year-over-year relative change in life expectancy.

    The change is computed per country between consecutive years (rows are
    sorted by country and year first) and then averaged over all countries
    for each year. The original grouped by year before calling pct_change,
    which compared neighbouring countries inside one year instead of
    consecutive years of the same country.

    Returns a DataFrame with the columns 'Year' and 'LifeExpChange'. The
    earliest year of every country has no predecessor, so its change is NaN.
    """
    ordered = statusRows.sort_values(['Country', 'Year'])
    yearlyChange = ordered.groupby('Country')['Life expectancy '].pct_change(periods = 1)
    changeTable = yearlyChange.groupby(ordered['Year']).mean().reset_index()
    changeTable.columns = ['Year', 'LifeExpChange']
    return changeTable


developedMeanLifExp = _meanLifeExpChangePerYear(developedCountries)
developingMeanLifExp = _meanLifeExpChangePerYear(developingCountries)
# Yearly bar charts, one row per metric: developed countries on the left,
# developing countries on the right, sharing the y axis within a row.
# Each entry is (column name, y-axis label).
statusMetrics = (
    ('Life expectancy ', 'Life Expectancy'),
    ('Income composition of resources', 'Income composition of resources'),
    ('Schooling', 'Schooling'),
    ('Adult Mortality', 'Adult Mortality'),
)
plt.figure(figsize=(20, 20))
for metricRow, (metricColumn, metricLabel) in enumerate(statusMetrics):
    plt.subplot(4, 2, 2 * metricRow + 1)
    plt.tight_layout()
    if metricColumn == 'Schooling':
        # Seaborn settings are re-applied right before the Schooling panels.
        sns.set(font_scale = 1)
    ax1 = sns.barplot(x = developedCountries['Year'], y = developedCountries[metricColumn], palette="rocket")
    plt.xticks(rotation = 45)
    Text = ax1.set(xlabel='Years', ylabel = metricLabel, title='Developed Countries')

    plt.subplot(4, 2, 2 * metricRow + 2, sharey = ax1)
    plt.tight_layout()
    ax2 = sns.barplot(x = developingCountries['Year'], y = developingCountries[metricColumn], palette="rocket")
    plt.xticks(rotation = 45)
    Text = ax2.set(xlabel='Years', ylabel = metricLabel, title='Developing Countries')
# Life-expectancy change per year for developed and developing countries,
# drawn next to each other on a shared y axis.
# NOTE(review): the grid has three slots but only two are filled, and the
# figure is 100 x 25 inches - possibly unintended.
statusChangeTables = ((developedMeanLifExp, 'Developed Countries'),
                      (developingMeanLifExp, 'Developing Countries'))
plt.figure(figsize=(100, 25))
statusChangeAxes = []
for panelSlot, (changeTable, panelTitle) in enumerate(statusChangeTables, start=1):
    shareArgs = {'sharey': statusChangeAxes[0]} if statusChangeAxes else {}
    plt.subplot(1, 3, panelSlot, **shareArgs)
    panelAxes = sns.barplot(x = changeTable['Year'], y = changeTable['LifeExpChange'], palette="vlag")
    plt.xticks(rotation = 45)
    Text = panelAxes.set(xlabel='Years', ylabel = 'Life Expectancy Change', title=panelTitle)
    statusChangeAxes.append(panelAxes)
ax1, ax2 = statusChangeAxes
# 3D scatter of income composition, adult mortality and schooling, one panel
# per development status.
fig = plt.figure(figsize=(25, 10))
statusScatterAxes = []
for gridCode, statusRows, panelTitle in ((121, developedCountries, 'DEVELOPED'),
                                         (122, developingCountries, 'DEVELOPING')):
    scatterAxes = plt.subplot(gridCode, projection='3d')
    scatterAxes.scatter(statusRows['Income composition of resources'],
                        statusRows['Adult Mortality'],
                        statusRows['Schooling'],
                        edgecolors="red")
    scatterAxes.set_xlabel('Income composition of resources')
    scatterAxes.set_ylabel('Adult Mortality')
    scatterAxes.set_zlabel('Schooling')
    scatterAxes.set_title(panelTitle)
    statusScatterAxes.append(scatterAxes)
ax1, ax2 = statusScatterAxes

# Summary statistics of life expectancy per development status.
winsorizedImputedFeats.groupby('Status')['Life expectancy '].describe()
# Bin life expectancy into three classes: (40, 55] -> 3, (55, 70] -> 2,
# (70, 90] -> 1. The same labels are stored in both working tables.
bins = [40, 55, 70, 90]
classes = [3,2,1]
lifeExpClass = pd.cut(winsorizedImputedSelectedFeats['Life expectancy '], bins=bins, labels = classes)
winsorizedImputedSelectedFeats['World class'] = lifeExpClass
winsorizedImputedFeats['World class'] = lifeExpClass
winsorizedImputedCNumSelectedFeats = winsorizedImputedSelectedFeats.select_dtypes(include=[np.number])

# Life-expectancy distribution of every class in one figure.
plt.figure(figsize=(12, 6), dpi=80)
for classLabel, classColour in ((1, 'k'), (2, 'y'), (3, 'r')):
    inClass = winsorizedImputedSelectedFeats['World class'] == classLabel
    sns.distplot(winsorizedImputedSelectedFeats[inClass]['Life expectancy '], color = classColour)
labels = ['1st World', '2nd World', '3rd World']
plt.legend(labels=labels, bbox_to_anchor=(1, 1))
plt.show()
# Normality check per world class: a QQ plot against the normal distribution
# and the Shapiro-Wilk test on the same subset. The original printed the
# Shapiro result of the whole column three times, so the printed statistics
# did not belong to the plotted classes.
plt.figure(figsize=(12, 5), dpi=80)
for gridCode, classLabel, classOrdinal in ((131, 1, '1st'), (132, 2, '2nd'), (133, 3, '3rd')):
    classLifeExp = winsorizedImputedSelectedFeats[
        winsorizedImputedSelectedFeats['World class'] == classLabel]['Life expectancy ']
    plt.subplot(gridCode)
    stats.probplot(classLifeExp, dist="norm", plot=plt)
    plt.title('Lifespan-{}World-QQplot'.format(classOrdinal))
    print(stats.shapiro(classLifeExp))
# Donut charts: for every world class, the number of rows per region.
donutRegions = ['Asia', 'Europe', 'Africa', 'Americas', 'Oceania']
# (class label, horizontal extent of the donut, trace name, annotation x)
donutSpecs = (
    (1, [0, 0.33], "First World", 0),
    (2, [0.34, 0.66], "Second World", 0.5),
    (3, [0.67, 1], "Third World", 1),
)


def _regionRowCounts(classLabel):
    """Count the rows of winsorizedImputedFeats per region for one world class."""
    inClass = winsorizedImputedFeats['World class'] == classLabel
    return [len(winsorizedImputedFeats[(winsorizedImputedFeats['Region'] == regionName) & inClass])
            for regionName in donutRegions]


figure = {
    "data": [
        {"values": _regionRowCounts(classLabel),
         "labels": list(donutRegions),
         "domain": {"x": donutExtent},
         "name": traceName,
         "hoverinfo": "label+percent+name",
         "hole": 0.25,
         "type": "pie"}
        for classLabel, donutExtent, traceName, _ in donutSpecs
    ],
    "layout": {
        "title": "Percentage of each region for every World class",
        "annotations": [
            {"font": {"size": 20},
             "showarrow": False,
             "text": traceName + " Countries",
             "x": annotationX,
             "y": 0}
            for _, _, traceName, annotationX in donutSpecs
        ],
    },
}
plotly.offline.iplot(figure, filename='donut')
# Rows of each world class.
worldClassColumn = winsorizedImputedFeats['World class']
firstWorldCountries = winsorizedImputedFeats[worldClassColumn == 1]
secondWorldCountries = winsorizedImputedFeats[worldClassColumn == 2]
thirdWorldCountries = winsorizedImputedFeats[worldClassColumn == 3]

# 3D scatter of income composition, adult mortality and schooling, one panel
# per world class.
fig = plt.figure(figsize=(30, 8))
worldScatterAxes = []
for gridCode, classRows, panelTitle in ((131, firstWorldCountries, 'FIRST WORLD'),
                                        (132, secondWorldCountries, 'SECOND WORLD'),
                                        (133, thirdWorldCountries, 'THIRD WORLD')):
    scatterAxes = plt.subplot(gridCode, projection='3d')
    scatterAxes.scatter(classRows['Income composition of resources'],
                        classRows['Adult Mortality'],
                        classRows['Schooling'],
                        edgecolors="red")
    scatterAxes.set_xlabel('Income composition of resources')
    scatterAxes.set_ylabel('Adult Mortality')
    scatterAxes.set_zlabel('Schooling')
    scatterAxes.set_title(panelTitle)
    worldScatterAxes.append(scatterAxes)
ax1, ax2, ax3 = worldScatterAxes
def _worldClassChangePerYear(classLabel):
    """Return the mean year-over-year life-expectancy change of one world class.

    The relative change is computed per country between consecutive years on
    the complete table, so a country that switches class between years still
    gets a true one-year change. The rows of the requested class are then
    averaged per year. The original grouped by year before calling
    pct_change, which compared neighbouring countries inside one year instead
    of consecutive years of the same country.

    Returns a DataFrame with the columns 'Year' and 'LifeExpChange'. The
    earliest year of every country has no predecessor, so its change is NaN.
    """
    ordered = winsorizedImputedFeats.sort_values(['Country', 'Year'])
    ordered = ordered.assign(
        LifeExpChange=ordered.groupby('Country')['Life expectancy '].pct_change(periods = 1))
    classRows = ordered[ordered['World class'] == classLabel]
    return classRows.groupby('Year')['LifeExpChange'].mean().reset_index()


# Classes 1, 2 and 3 are the first, second and third world subsets.
FirstWCMeanLifExp = _worldClassChangePerYear(1)
SecondWCMeanLifExp = _worldClassChangePerYear(2)
ThirdWCMeanLifExp = _worldClassChangePerYear(3)
# Yearly bar charts, one row per metric and one column per world class; the
# second and third panel of a row share the y axis of the first.
# Each metric entry is (column name, y-axis label).
worldClassMetrics = (
    ('Life expectancy ', 'Life Expectancy'),
    ('Income composition of resources', 'Income composition of resources'),
    ('Schooling', 'Schooling'),
    ('Adult Mortality', 'Adult Mortality'),
)
worldClassFrames = (
    (firstWorldCountries, 'First World Countries'),
    (secondWorldCountries, 'Second World Countries'),
    (thirdWorldCountries, 'Third World Countries'),
)
plt.figure(figsize=(20, 20))
for metricRow, (metricColumn, metricLabel) in enumerate(worldClassMetrics):
    rowAxes = []
    for classSlot, (classRows, classTitle) in enumerate(worldClassFrames, start=1):
        shareArgs = {'sharey': rowAxes[0]} if rowAxes else {}
        plt.subplot(4, 3, 3 * metricRow + classSlot, **shareArgs)
        plt.tight_layout()
        panelAxes = sns.barplot(x = classRows['Year'], y = classRows[metricColumn], palette="rocket")
        plt.xticks(rotation = 45)
        Text = panelAxes.set(xlabel='Years', ylabel = metricLabel, title=classTitle)
        rowAxes.append(panelAxes)
# Leave ax1 / ax2 bound to the same panels of the last row as the unrolled
# version of this cell did (third and second panel respectively).
ax1, ax2 = rowAxes[2], rowAxes[1]
# Yearly change in mean life expectancy, one panel per country group.
# All three panels share the y-axis of the first so the bars are comparable.
plt.figure(figsize=(20, 6))

plt.subplot(1, 3, 1)
plt.tight_layout()
ax1 = sns.barplot(x=FirstWCMeanLifExp['Year'], y=FirstWCMeanLifExp['LifeExpChange'], palette="vlag")
plt.xticks(rotation=45)
Text = ax1.set(xlabel='Years', ylabel='Life Expectancy Change', title='First World Countries')

plt.subplot(1, 3, 2, sharey=ax1)
plt.tight_layout()
ax2 = sns.barplot(x=SecondWCMeanLifExp['Year'], y=SecondWCMeanLifExp['LifeExpChange'], palette="vlag")
plt.xticks(rotation=45)
Text = ax2.set(xlabel='Years', ylabel='Life Expectancy Change', title='Second World Countries')

plt.subplot(1, 3, 3, sharey=ax1)
plt.tight_layout()
ax3 = sns.barplot(x=ThirdWCMeanLifExp['Year'], y=ThirdWCMeanLifExp['LifeExpChange'], palette="vlag")
plt.xticks(rotation=45)
# Title typo fixed ("Counrtries" -> "Countries").
Text = ax3.set(xlabel='Years', ylabel='Life Expectancy Change', title='Third World Countries')
# Columns fed to the clustering step (names keep the dataset's original
# leading/trailing spaces).
selectedCols = [
    'Life expectancy ',
    'Adult Mortality',
    ' HIV/AIDS',
    'Income composition of resources',
    'Schooling',
    'Sanitation Mortality/100000(2016)',
]
clusterData = winsorizedImputedSelectedFeats[selectedCols]

# Standardise the selected columns before clustering.
scaler = StandardScaler()
clusterDataScaled = scaler.fit(clusterData).transform(clusterData)
# Silhouette score for every (cluster count, random seed) pair, shown as a heatmap.
rangeOfClusters = range(2, 10)
randomSeedRange = range(0, 10)


def _silhouetteFor(clusterCount, randomSeed):
    """Fit KMeans on the scaled cluster data and return its mean silhouette score."""
    fittedLabels = KMeans(n_clusters=clusterCount, random_state=randomSeed).fit_predict(clusterDataScaled)
    return silhouette_score(clusterDataScaled, fittedLabels)


# Same iteration order as nested loops: cluster count outer, seed inner.
scoreResults = [
    [clusterCount, randomSeed, _silhouetteFor(clusterCount, randomSeed)]
    for clusterCount in rangeOfClusters
    for randomSeed in randomSeedRange
]

resultFinal = pd.DataFrame(scoreResults, columns=["numOfclusters", "seed", "silhouetteScore"])
pivotTable = resultFinal.pivot_table(index="numOfclusters", columns="seed", values="silhouetteScore")

plt.figure(figsize=(15, 6))
sns.heatmap(pivotTable, annot=True, linewidths=.5, fmt='.3f')
plt.tight_layout()
# Final clustering: 2 clusters, fixed seed, fitted on the scaled features.
kmeansSilh = KMeans(n_clusters=2, random_state=0).fit(clusterDataScaled)
labels = pd.DataFrame(kmeansSilh.labels_)
# Attach the labels positionally (row i of the scaled matrix <-> row i of
# clusterData). Passing the RangeIndex-ed `labels` frame would align on index
# and yield NaN / misassigned clusters if clusterData's index is not 0..n-1.
clusteredData = clusterData.assign(Cluster=kmeansSilh.labels_)
# Silhouette diagnostic plot: one horizontal band per cluster, with the mean
# silhouette score drawn as a dashed red line.
# NOTE(review): the labelling model above (kmeansSilh) uses 2 clusters while
# this diagnostic fits 3 — confirm which cluster count is intended here.
clusterer = KMeans(n_clusters=3, random_state=0)
clusterLabels = clusterer.fit_predict(clusterDataScaled)
silhouetteAavg = silhouette_score(clusterDataScaled, clusterLabels)
sampleSilhouetteValues = silhouette_samples(clusterDataScaled, clusterLabels)

# Drive the loop and the colour scale from the fitted model so every cluster is
# drawn; previously both were hard-coded to 2 and the third cluster was omitted.
nClustersPlotted = clusterer.n_clusters

fig, ax1 = plt.subplots(figsize=(15, 6))
yLower = 10  # vertical gap before the first band
for i in range(nClustersPlotted):
    ith_cluster_silhouette_values = np.sort(sampleSilhouetteValues[clusterLabels == i])
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    yUpper = yLower + size_cluster_i
    color = cm.nipy_spectral(float(i) / nClustersPlotted)
    ax1.fill_betweenx(np.arange(yLower, yUpper), 0, ith_cluster_silhouette_values,
                      facecolor=color, edgecolor="black", alpha=0.7)
    # Cluster number, placed at the vertical middle of its band.
    ax1.text(-0.05, yLower + 0.5 * size_cluster_i, str(i))
    yLower = yUpper + 10  # leave a 10-row gap between bands

ax1.get_yaxis().set_ticks([])
ax1.set_title("The silhouette plot for various clusters")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")
ax1.axvline(x=silhouetteAavg, color="red", linestyle="--")
ax1.set_xticks([-0.2, -0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
# Per-cluster mean of every column, rounded to one decimal for display.
clusterMeans = clusteredData.groupby(['Cluster']).mean()
groupedbyCluster = clusterMeans.round(1)
groupedbyCluster
# 3D scatter of the clustered rows, coloured by cluster label.
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection='3d')
x, y, z = (
    clusteredData[axisColumn]
    for axisColumn in ('Life expectancy ', 'Schooling', 'Adult Mortality')
)
ax.scatter(x, y, z, c=clusteredData["Cluster"], s=50, alpha=0.6, cmap="RdBu")
for setAxisLabel, axisLabel in (
    (ax.set_xlabel, 'Life expectancy'),
    (ax.set_ylabel, 'Schooling'),
    (ax.set_zlabel, 'Adult Mortality'),
):
    setAxisLabel(axisLabel)
# Camera position: first argument is elevation, second is azimuth.
ax.view_init(30, 140)
# Append the cluster label as the last column of the working frame.
clusterColumn = clusteredData['Cluster'].values
winsorizedImputedSelectedFeats = winsorizedImputedSelectedFeats.assign(Cluster=clusterColumn)

# Features: columns 7 .. second-to-last; target: the last column (Cluster).
# NOTE(review): X keeps column -2, which the SVC sections further down use as
# the world-class target and exclude from their features (iloc[:, 7:-2]) —
# confirm that including it here is intended.
X = winsorizedImputedSelectedFeats.iloc[:, 7:-1]
Y = winsorizedImputedSelectedFeats.iloc[:, -1]

# Both X and Y lose their last 7 rows (reason not visible in this section).
X = X.drop(X.tail(7).index)
Y = Y.drop(Y.tail(7).index)
# Candidate classifiers (all with default hyperparameters) and the accumulators
# filled by the cross-validation loop.
seed = 42
listOfTestedModels = [
    ('Logistic Reg', LogisticRegression()),
    ('Linear Discr Anal', LinearDiscriminantAnalysis()),
    ('K-NearestNeighbors', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('Support Vector Machine', SVC()),
]
AccResults, algos, msg = [], [], []
# 10-fold cross-validated accuracy for every candidate model.
# random_state only has an effect when shuffle=True; recent scikit-learn
# releases raise ValueError for KFold(random_state=...) without shuffling, and
# older ones silently ignored the seed. The splitter is loop-invariant, so it
# is built once and produces the same folds for every model.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
for name, model in listOfTestedModels:
    CrossValResults = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
    AccResults.append(CrossValResults)
    algos.append(name)
    msg.append((name, CrossValResults.mean(), CrossValResults.std()))

# Summary table: one row per algorithm with mean and std of the fold accuracies.
MSG = pd.DataFrame(msg, columns=["Algorithm's name", "Algorithm's score", "Algorithm's std"])
MSG.sort_values("Algorithm's score")
# Box plot of the per-fold accuracies, one box per algorithm.
fig = plt.figure(figsize=(15, 10))
sns.set(font_scale=1)
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
# `ax` is the current axes, so drawing through it matches plt.boxplot.
ax.boxplot(AccResults)
ax.set_xticklabels(algos)
plt.show()
# Baseline SVC predicting column -2 (labelled First/Second/Third World in the
# confusion matrix of this section) from columns 7 .. third-to-last.
X1 = winsorizedImputedSelectedFeats.iloc[:, 7:-2]
Y1 = winsorizedImputedSelectedFeats.iloc[:, -2]

# Split first, then fit the scaler on the training rows only, so the test set
# does not leak into the scaling statistics. X1 itself is left unscaled.
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, test_size=0.3, random_state=42)
scaler.fit(X1_train)
X1_train = scaler.transform(X1_train)
X1_test = scaler.transform(X1_test)

svc = SVC()
svc.fit(X1_train, Y1_train)
y1Pred = svc.predict(X1_test)
print('Accuracy score of baseline model:')
print(metrics.accuracy_score(Y1_test, y1Pred))
# Grid search over C and kernel with 10-fold CV on the training split.
gridParams1 = {'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear', 'rbf', 'poly']}
CrossValSVC1 = GridSearchCV(estimator=svc, param_grid=gridParams1, cv=10)
CrossValSVC1.fit(X1_train, Y1_train)
CrossValSVC1.best_params_  # bare expression: shown when it ends a notebook cell

# Build the tuned model from the search result instead of hard-coding
# C/kernel, so it stays correct if the data or the grid changes.
CrossValSVCBest1 = SVC(**CrossValSVC1.best_params_)
CrossValSVCBest1.fit(X1_train, Y1_train)
predBest1 = CrossValSVCBest1.predict(X1_test)

print('Accuracy score of baseline model:')
print(metrics.accuracy_score(Y1_test, y1Pred))
print('The accuracy score for the best hyperparameters:')
worldClassAcc = metrics.accuracy_score(Y1_test, predBest1)
print(worldClassAcc)

# Confusion matrix heatmap.
# NOTE(review): confusion_matrix orders rows/columns by sorted label value;
# this assumes that order corresponds to First/Second/Third World — confirm.
confMA1 = confusion_matrix(Y1_test, predBest1)
classes1 = ['First World', 'Second World', 'Third World']
DFconfMA1 = pd.DataFrame(confMA1, index=classes1, columns=classes1)
plt.figure(figsize=(10, 7))
sns.heatmap(DFconfMA1, annot=True, cmap='BuPu', fmt='g')
# Baseline SVC predicting column 1 (values 'Developing' / 'Developed',
# encoded as 0 / 1) from columns 7 .. third-to-last.
X2 = winsorizedImputedSelectedFeats.iloc[:, 7:-2]
Y2 = winsorizedImputedSelectedFeats.iloc[:, 1]
Y2 = Y2.map({'Developing': 0, 'Developed': 1})

# Split first, then fit the scaler on the training rows only, so the test set
# does not leak into the scaling statistics. X2 itself is left unscaled.
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X2, Y2, test_size=0.3, random_state=42)
scaler.fit(X2_train)
X2_train = scaler.transform(X2_train)
X2_test = scaler.transform(X2_test)

svc = SVC()
svc.fit(X2_train, Y2_train)
y2Pred = svc.predict(X2_test)
print('Accuracy score of baseline model:')
print(metrics.accuracy_score(Y2_test, y2Pred))
# Grid search over C and kernel with 10-fold CV on the training split.
gridParams2 = {'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear', 'rbf', 'poly']}
CrossValSVC2 = GridSearchCV(estimator=svc, param_grid=gridParams2, cv=10)
CrossValSVC2.fit(X2_train, Y2_train)
CrossValSVC2.best_params_  # bare expression: shown when it ends a notebook cell

# Build the tuned model from the search result instead of hard-coding
# C/kernel, so it stays correct if the data or the grid changes.
CrossValSVCBest2 = SVC(**CrossValSVC2.best_params_)
CrossValSVCBest2.fit(X2_train, Y2_train)
predBest2 = CrossValSVCBest2.predict(X2_test)

print('Accuracy score of baseline model:')
print(metrics.accuracy_score(Y2_test, y2Pred))
print('The accuracy score for the best hyperparameters:')
statusAcc = metrics.accuracy_score(Y2_test, predBest2)
print(statusAcc)

# Confusion matrix heatmap; label order follows the 0/1 encoding of the
# status target (0 = Developing, 1 = Developed).
confMA2 = confusion_matrix(Y2_test, predBest2)
classes2 = ['Developing', 'Developed']
DFconfMA2 = pd.DataFrame(confMA2, index=classes2, columns=classes2)
plt.figure(figsize=(10, 7))
sns.heatmap(DFconfMA2, annot=True, cmap='BuPu', fmt='g')
# Baseline SVC predicting the last column (Cluster) from columns
# 7 .. third-to-last; the last 7 rows are removed from both X3 and Y3.
# Non-inplace drops avoid mutating a slice of the parent frame.
X3 = winsorizedImputedSelectedFeats.iloc[:, 7:-2]
X3 = X3.drop(X3.tail(7).index)
Y3 = winsorizedImputedSelectedFeats.iloc[:, -1]
Y3 = Y3.drop(Y3.tail(7).index)

# Split first, then fit the scaler on the training rows only, so the test set
# does not leak into the scaling statistics. X3 itself is left unscaled.
X3_train, X3_test, Y3_train, Y3_test = train_test_split(X3, Y3, test_size=0.3, random_state=42)
scaler.fit(X3_train)
X3_train = scaler.transform(X3_train)
X3_test = scaler.transform(X3_test)

svc = SVC()
svc.fit(X3_train, Y3_train)
y3Pred = svc.predict(X3_test)
print('Accuracy score of baseline model:')
print(metrics.accuracy_score(Y3_test, y3Pred))
# Grid search over C and kernel with 10-fold CV on the training split.
gridParams3 = {'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear', 'rbf', 'poly']}
CrossValSVC3 = GridSearchCV(estimator=svc, param_grid=gridParams3, cv=10)
CrossValSVC3.fit(X3_train, Y3_train)
CrossValSVC3.best_params_  # bare expression: shown when it ends a notebook cell

# Build the tuned model from the search result instead of hard-coding
# C/kernel, so it stays correct if the data or the grid changes.
CrossValSVCBest3 = SVC(**CrossValSVC3.best_params_)
CrossValSVCBest3.fit(X3_train, Y3_train)
predBest3 = CrossValSVCBest3.predict(X3_test)

print('Accuracy score of baseline model:')
print(metrics.accuracy_score(Y3_test, y3Pred))
print('The accuracy score for the best hyperparameters:')
clusterAcc = metrics.accuracy_score(Y3_test, predBest3)
print(clusterAcc)

# Confusion matrix heatmap for the two cluster labels.
confMA3 = confusion_matrix(Y3_test, predBest3)
classes3 = ['0', '1']
DFconfMA3 = pd.DataFrame(confMA3, index=classes3, columns=classes3)
plt.figure(figsize=(10, 7))
sns.heatmap(DFconfMA3, annot=True, cmap='BuPu', fmt='g')
# Report the three tuned-model accuracies from best to worst. The ranking is
# computed from the measured values instead of being hard-coded in the text,
# so the sentences stay true when the numbers change. sorted() is stable, so
# ties keep the world class / cluster / status order.
accuracyByTask = [
    ('world class', worldClassAcc),
    ('cluster', clusterAcc),
    ('status', statusAcc),
]
rankedTasks = sorted(accuracyByTask, key=lambda taskAndAcc: taskAndAcc[1], reverse=True)
for rankLabel, (taskName, taskAcc) in zip(('highest', 'second highest', 'third highest'), rankedTasks):
    print('The {} accuracy was achieved for the {} prediction with an accuracy of '.format(rankLabel, taskName),
          taskAcc*100, '%')